# Purpose:
# Replicate the one-tailed Welch t-test, bootstrap CI, and Cohen's d for
# between-group differences in the proportion of silences among SP/(SP+FP).

library(readxl)
library(dplyr)
library(tidyr)
library(boot)

input_file <- "data/change to data file name"
sheet_name <- "fixed"
output_file <- "outputs/05_silence_proportion_group_comparison.txt"

df <- read_excel(input_file, sheet = sheet_name)

required_cols <- c("tier", "text", "correct ID", "ניסיון מקצועי")
missing_cols <- setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

df_summary <- df %>%
  filter(tolower(tier) == "disfluency", tolower(text) %in% c("sp", "fp")) %>%
  mutate(text = tolower(text)) %>%
  rename(participant_id = `correct ID`, group_label = `ניסיון מקצועי`) %>%
  mutate(group = ifelse(group_label == "סטודנט תואר ראשון", "novice", "expert")) %>%
  count(participant_id, group, text) %>%
  pivot_wider(names_from = text, values_from = n, values_fill = 0) %>%
  mutate(
    sp_fp_total = sp + fp,
    prop_sp = ifelse(sp_fp_total > 0, sp / sp_fp_total, NA_real_)
  ) %>%
  filter(!is.na(prop_sp))

df_summary$group <- factor(df_summary$group, levels = c("novice", "expert"))
t_result <- t.test(prop_sp ~ group, data = df_summary, alternative = "less")

mean_diff <- function(data, indices) {
  d <- data[indices, ]
  mean(d$prop_sp[d$group == "expert"]) - mean(d$prop_sp[d$group == "novice"])
}

set.seed(123)
boot_out <- boot(data = df_summary, statistic = mean_diff, R = 10000)
boot_ci <- boot.ci(boot_out, type = "perc")

mean_nov <- mean(df_summary$prop_sp[df_summary$group == "novice"])
sd_nov <- sd(df_summary$prop_sp[df_summary$group == "novice"])
n_nov <- sum(df_summary$group == "novice")
mean_exp <- mean(df_summary$prop_sp[df_summary$group == "expert"])
sd_exp <- sd(df_summary$prop_sp[df_summary$group == "expert"])
n_exp <- sum(df_summary$group == "expert")

pooled_sd <- sqrt(((n_nov - 1) * sd_nov^2 + (n_exp - 1) * sd_exp^2) / (n_nov + n_exp - 2))
cohen_d <- (mean_exp - mean_nov) / pooled_sd

sink(output_file)
cat("Silence proportion group comparison\n\n")
cat(sprintf("Novice: M = %.3f, SD = %.3f, n = %d\n", mean_nov, sd_nov, n_nov))
cat(sprintf("Expert: M = %.3f, SD = %.3f, n = %d\n\n", mean_exp, sd_exp, n_exp))
print(t_result)
cat("\nBootstrap percentile CI for expert - novice difference:\n")
print(boot_ci)
cat("\nCohen's d:\n")
print(cohen_d)
sink()
